# NOTES ------------------------------------------------------------------------
#   - Sample from households with at least 12 bills
#   - Keep bills between 28 and 34 days
#   - Does not limit to 2010–2014 (PGE covers 2002–2014; SoCal covers 2010–2015)

# Description ------------------------------------------------------------------

# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = F)
  # Packages
  library(pacman)
  p_load(stringr, data.table, lfe, lubridate, foreach, doParallel, magrittr)
  # Directories
  dir_project <- "S:/Edward/Elasticity/"
  dir_csv     <- paste0(dir_project, "DataCsv/")
  dir_rds     <- paste0(dir_project, "DataR/")
  dir_pge     <- paste0(dir_rds, "Prices/PGE/")
  dir_socal   <- paste0(dir_rds, "Prices/SoCal/")
  dir_fwl     <- paste0(dir_rds, "Prices/FWL/")
  # More options
  # options(lfe.threads = 32)
  options(datatable.print.nrows = 10)
  # Set seed
  set.seed(12345)

# Define desired variables -----------------------------------------------------
  # Define variables that I want without lags
  var_nolag <- c(
    "begin_date", "end_date", "year", "ym",
    "zip", "zip_plus4", "zip9",
    "care", "care_pct",
    "hh_id", "hh_month",
    "climate", "rate_sch", "utility",
    "allowance", "rev", "pct_flags",
    NULL)
  # Define variables with which we want lags
  var_lags <- c(
    # "allowance",
    "thm", "thm_day",
    "days", "month",
    # paste0("t_p_", str_pad(1:14, 2, "left", 0)),
    "bill_hdd",
    "east_year_hdd", "ca_year_hdd",
    # "east_year_cdd", "ca_year_cdd",
    # "east_year_dd", "ca_year_dd",
    "total_bill",
    "bill_flag",
    # "care",
    "price_avg", "price_avg_mrg",
    "price_base", "price_base_ppps",
    # "price_excess", "price_excess_ppps",
    # "price_base_init", "price_excess_init",
    # "ppps_charge",
    "price_mrg", "price_mrg_ppps",
    "mrg_sim_iv_10_14", "mrg_sim_iv_11_13",
    "spot_price_1week",
    "spot_price_2week",
    "spot_price_3week",
    # "spot_price_1week_init", "spot_price_2week_init", "spot_price_3week_init",
    NULL)
  # Define lags and leads
  the_lags <- c(1:2) %>% paste0("_lag", .)
  # the_leads <- c(1) %>% paste0("_lead", .)
  # Defined all desired variables (including lags and leads)
  all_vars <- c(
    var_nolag, var_lags,
    paste0(rep(var_lags, each = length(the_lags)), the_lags),
    # paste0(rep(var_lags, each = length(the_leads)), the_leads),
    NULL
  )
  # Clean up a bit
  rm(var_nolag, var_lags, the_leads, the_lags)

# Function: Load data ----------------------------------------------------------
  zip_loader <- function(the_zip, the_utility, all_vars, dir_rds) {
    # Build the file name
    the_file <- paste0(dir_rds, "Prices/", the_utility, "/", "prices_",
      tolower(the_utility), "_", the_zip, ".rds")
    # Open the associated file
    bill_dt <- readRDS(the_file)
    # Keep only the desired variables (those in 'all_vars')
    bill_dt[, setdiff(names(bill_dt), all_vars) := NULL]
    # Return bill_dt
    return(bill_dt)
  }

# Function: Clean data ---------------------------------------------------------
  zip_cleaner <- function(bill_dt) {
    # Find the most extreme 1 percent of observations on therms
    thm_lo <- quantile(x = bill_dt[,thm], probs = 0.005)
    thm_hi <- quantile(x = bill_dt[, thm], probs = 0.995)
    # Find the most extreme 1 percent of observations on revenue
    rev_lo <- quantile(x = bill_dt[, rev], probs = 0.005)
    rev_hi <- quantile(x = bill_dt[, rev], probs = 0.995)
    # Flag observations outside of the central 99%
    bill_dt[, `:=`(
      flag_thm = 1 * ((thm > thm_hi) | (thm < thm_lo)),
      flag_rev = 1 * ((rev > rev_hi) | (rev < rev_lo))
      )]
    # Create 1st and 2nd lags of these flags
    setorder(bill_dt, hh_id, begin_date)
    bill_dt[, `:=`(
      flag_rev_lag1  = shift(flag_rev,  1L, fill = NA, type = "lag"),
      flag_thm_lag1  = shift(flag_thm,  1L, fill = NA, type = "lag"),
      flag_rev_lag2  = shift(flag_rev,  2L, fill = NA, type = "lag"),
      flag_thm_lag2  = shift(flag_thm,  2L, fill = NA, type = "lag")
    ), by = hh_id]
# NOTE: May want to move this step to analysis file
    # Keep bills with 28-34 days
    bill_dt <- bill_dt[between(days, 28, 34)]
    # Restrict to set of households with at least 12 bills
    bill_dt[, n_bills := .N, by = hh_id]
    # Drop households if more than 50% of their bills are flagged
    bill_dt <- bill_dt[pct_flags < 0.50]
    # Drop flagged bills
    bill_dt <- bill_dt[
      (flag_rev == 0) & (flag_rev_lag1 == 0) & (flag_rev_lag2 == 0) &
      (flag_thm == 0) & (flag_thm_lag1 == 0) & (flag_thm_lag2 == 0) &
      (thm_day > 0) & (price_mrg_lag2 > 0) & (price_base > 0)
      ]
    # Return bill_dt
    return(bill_dt)
  }

# Function: Add additional variables -------------------------------------------
  zip_adder <- function(bill_dt) {
    # Spatiotemporal controls
    bill_dt[, hh_year := paste0(hh_id, "_", year)]
    bill_dt[, zip_ym := paste0(zip, "_", ym)]
    bill_dt[, zip_ym_utility := paste0(zip_ym, utility)]
    bill_dt[, zip_year := paste0(zip, "_", year)]
    bill_dt[, week := week(begin_date)]
    bill_dt[, yw := paste0(year, str_pad(week, 2, "left", 0))]
    bill_dt[, zip_yw := paste0(zip, "_", yw)]
    # Change HDD and CDD variables to thousands
    vars_hdd <- bill_dt %>% names() %>% grep("hdd", ., value = T)
    vars_cdd <- bill_dt %>% names() %>% grep("cdd", ., value = T)
    for (j in c(vars_cdd, vars_hdd)) {
      set(x = bill_dt, j = j, value = bill_dt[[j]] / 1e3)
    }
    rm(j, vars_hdd, vars_cdd); gc()
    # Return bill_dt
    return(bill_dt)
  }

# Function: Run OLS regressions ------------------------------------------------
  zip_ols <- function(bill_dt, the_zip, the_utility, dir_fwl) {
    # Residualize log(thm_day), log(price_mrg), and log(price_base) on
    # bill HDDs, HH FEs, and month-of-sample FE
    resid_lfe <- felm(log(thm_day) | log(price_mrg_lag2) | log(price_base_lag2) ~
      bill_hdd | hh_id + ym,
      data = bill_dt, keepCX = T)$residuals %>% data.table()
    setnames(resid_lfe, c("log_thm_day", "log_price_mrg", "log_price_base"))
    # Run the FWL-based regressions
    mrg_lfe <- felm(log_thm_day ~ -1 + log_price_mrg, resid_lfe)
    base_lfe <- felm(log_thm_day ~ -1 + log_price_base, resid_lfe)
    # Save the coefficients
    saveRDS(
      object = list(
        mrg_b = mrg_lfe$coefficients[1,1],
        base_b = base_lfe$coefficients[1,1]),
      file = paste0(dir_fwl, "b/", tolower(the_utility), "FWL", the_zip, ".rds"))
    # Save the Xs
    saveRDS(
      object = list(
        mrg_x = mrg_lfe$X,
        base_x = base_lfe$X),
      file = paste0(dir_fwl, "x/", tolower(the_utility), "FWL", the_zip, ".rds"))
    # Save the X'X
    saveRDS(
      object = list(
        mrg_xx = sum(mrg_lfe$X^2),
        base_xx = sum(base_lfe$X^2)),
      file = paste0(dir_fwl, "xx/", tolower(the_utility), "FWL", the_zip, ".rds"))
    # Save the residuals
    saveRDS(
      object = list(
        mrg_u = mrg_lfe$residuals,
        base_u = base_lfe$residuals),
      file = paste0(dir_fwl, "u/", tolower(the_utility), "FWL", the_zip, ".rds"))
    # Save the number of observations
    saveRDS(
      object = list(
        mrg_n = mrg_lfe$X %>% nrow(),
        base_n = base_lfe$X %>% nrow()),
      file = paste0(dir_fwl, "n/", tolower(the_utility), "FWL", the_zip, ".rds"))
  }

# Function: Load, clean, and analyze given zip code ----------------------------
  # NOTE: the_utility' should be 'PGE' or 'SoCal'
  zip_runner <- function(the_zip, the_utility, all_vars, dir_rds, dir_fwl) {
    # Run the functions
    zip_loader(the_zip, the_utility, all_vars, dir_rds) %>%
      zip_cleaner() %>%
      zip_adder() %>%
      zip_ols(the_zip, the_utility, dir_fwl)
    # Garbage control
    invisible(gc())
  }

# Function: The meat estimator -------------------------------------------------
  zip_meat <- function(the_zip, the_utility, dir_fwl, b_pooled) {
    zip_name <- paste0(tolower(the_utility), "FWL", the_zip, ".rds")
    # Load the zip code's OLS data
    b_l <- paste0(dir_fwl, "b/", zip_name) %>% readRDS()
    x_l <- paste0(dir_fwl, "x/", zip_name) %>% readRDS()
    u_l <- paste0(dir_fwl, "u/", zip_name) %>% readRDS()
    # Calculate the correct (pooled) residuals
    e_l <- list(
      mrg_e  = x_l[[1]] * (b_l[[1]] - b_pooled[[1]]) + u_l[[1]],
      base_e = x_l[[2]] * (b_l[[2]] - b_pooled[[2]]) + u_l[[2]]
    )
    # The 'meat' estimator for the current zip code
    # NOTE: Math is easy here because we have Nx1 matrices
    meat_l <- list(
      mrg_meat  = sum(x_l[[1]] * e_l[[1]])^2,
      base_meat = sum(x_l[[2]] * e_l[[2]])^2
    )
    # Save the zip code's meat
    saveRDS(
      object = meat_l,
      file = paste0(dir_fwl, "meat/", zip_name))
    # Clean up
    rm(b_l, x_l, u_l, e_l, zip_name)
    # Garbage control
    invisible(gc())
  }

# Find the individual zip codes ------------------------------------------------
  # Find PGE's zip codes (for which we have data)
  pge_zips <- dir_pge %>% dir() %>%
    str_replace("prices_pge_", "") %>%
    str_replace(".rds", "") %>%
    grep("[a-z]", ., invert = T, value = T)
  # Repeat for SCG
  socal_zips <- dir_socal %>% dir() %>%
    str_replace("prices_socal_", "") %>%
    str_replace(".rds", "") %>%
    grep("[a-z]", ., invert = T, value = T)
  # Find the completed zip codes
  pge_done <- paste0(dir_fwl, "u") %>%
    dir() %>%
    str_subset("pge") %>%
    str_replace_all("[^0-9]", "")
  socal_done <- paste0(dir_fwl, "u") %>%
    dir() %>%
    str_subset("socal") %>%
    str_replace_all("[^0-9]", "")
  # Now find the unfinished zip codes
  pge_todo <- setdiff(pge_zips, pge_done)
  socal_todo <- setdiff(socal_zips, socal_done)

# # PGE: Regressions for each zip code -------------------------------------------
#   # Start cluster
#   cl <- makeCluster(6)
#   registerDoParallel(cl)
#   # Loop over the PG&E zip codes
#   foreach(the_zip = pge_todo,
#     .packages = c("data.table", "magrittr", "lubridate",
#       "stringr", "lfe")) %dopar% {
#         # Run the data prep and sampling functions for the zip code
#         zip_runner(
#           the_zip = the_zip,
#           the_utility = "PGE",
#           all_vars = all_vars,
#           dir_rds = dir_rds,
#           dir_fwl = dir_fwl)
#   }
#   # Kill cluster
#   stopCluster(cl)
#   # Clean up
#   gc()

# # SoCal: Regressions for each zip code -----------------------------------------
#   # Start cluster
#   cl <- makeCluster(6)
#   registerDoParallel(cl)
#   # Loop over the SoCal zip codes
#   foreach(the_zip = socal_todo,
#     .packages = c("data.table", "magrittr", "lubridate",
#       "stringr", "lfe")) %dopar% {
#         # Run the data prep and sampling functions for the zip code
#         zip_runner(
#           the_zip = the_zip,
#           the_utility = "SoCal",
#           all_vars = all_vars,
#           dir_rds = dir_rds,
#           dir_fwl = dir_fwl)
#   }
#   # Kill cluster
#   stopCluster(cl)
#   # Clean up
#   gc()

# Aggregate estimates ----------------------------------------------------------
  # Find the completed zip codes
  pge_done <- paste0(dir_fwl, "u") %>%
    dir() %>%
    str_subset("pge") %>%
    str_replace_all("[^0-9]", "")
  socal_done <- paste0(dir_fwl, "u") %>%
    dir() %>%
    str_subset("socal") %>%
    str_replace_all("[^0-9]", "")
  # Open the estimated coefficients
  b_pge_mrg <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "b/pgeFWL", ., ".rds") %>% readRDS() %$%mrg_b
  }) %>% unlist()
  b_socal_mrg <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "b/socalFWL", ., ".rds") %>% readRDS() %$%mrg_b
  }) %>% unlist()
  b_pge_base <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "b/pgeFWL", ., ".rds") %>% readRDS() %$%base_b
  }) %>% unlist()
  b_socal_base <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "b/socalFWL", ., ".rds") %>% readRDS() %$%base_b
  }) %>% unlist()
  # The centered SSR
  xx_pge_mrg <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/pgeFWL", ., ".rds") %>% readRDS() %$%mrg_xx
  }) %>% unlist()
  xx_socal_mrg <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/socalFWL", ., ".rds") %>% readRDS() %$%mrg_xx
  }) %>% unlist()
  xx_pge_base <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/pgeFWL", ., ".rds") %>% readRDS() %$%base_xx
  }) %>% unlist()
  xx_socal_base <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/socalFWL", ., ".rds") %>% readRDS() %$%base_xx
  }) %>% unlist()
  # Number of observations
  n_pge_mrg <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "n/pgeFWL", ., ".rds") %>% readRDS() %$%mrg_n
  }) %>% unlist()
  n_socal_mrg <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "n/socalFWL", ., ".rds") %>% readRDS() %$%mrg_n
  }) %>% unlist()
  n_pge_base <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "n/pgeFWL", ., ".rds") %>% readRDS() %$%base_n
  }) %>% unlist()
  n_socal_base <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "n/socalFWL", ., ".rds") %>% readRDS() %$%base_n
  }) %>% unlist()
  # Total observations by utility
  n_pge <- n_pge_mrg %>% sum()
  n_socal <- n_socal_mrg %>% sum()
  # The OLS weights estimator for marginal price
  b_mrg <- (sum(b_pge_mrg * xx_pge_mrg) +
    sum(b_socal_mrg * xx_socal_mrg)) /
    sum(xx_pge_mrg, xx_socal_mrg)
  # The OLS weights estimator for baseline price
  b_base <- (sum(b_pge_base * xx_pge_base) +
    sum(b_socal_base * xx_socal_base)) /
    sum(xx_pge_base, xx_socal_base)
  # Create a list of the two estimates
  b_pooled <- list(b_mrg = b_mrg, b_base = b_base)

# PGE: Meat estimator for clustered standard errors ----------------------------
  # Start cluster
  cl <- makeCluster(6)
  registerDoParallel(cl)
  # Loop over the PG&E zip codes
  foreach(the_zip = pge_done,
    .packages = c("data.table", "magrittr")) %dopar% {
      # Run the data prep and sampling functions for the zip code
      try(zip_meat(
        the_zip = the_zip,
        the_utility = "PGE",
        dir_fwl = dir_fwl,
        b_pooled = b_pooled
      ))
  }
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# SoCal: Meat estimator for clustered standard errors --------------------------
  # Start cluster
  cl <- makeCluster(6)
  registerDoParallel(cl)
  # Loop over the SoCal zip codes
  foreach(the_zip = socal_done,
    .packages = c("data.table", "magrittr")) %dopar% {
      # Run the data prep and sampling functions for the zip code
      try(zip_meat(
        the_zip = the_zip,
        the_utility = "SoCal",
        dir_fwl = dir_fwl,
        b_pooled = b_pooled
      ))
  }
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# Calculate clustered standard errors ------------------------------------------
  # Grab the meats
  meat_pge <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "meat/pgeFWL", ., ".rds") %>% readRDS()
  })
  meat_socal <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "meat/socalFWL", ., ".rds") %>% readRDS()
  })
  # Calculate the meats
  meat_mrg <- sum(lapply(X = meat_pge, extract2, 1) %>% unlist()) +
    sum(lapply(X = meat_socal, extract2, 1) %>% unlist())
  meat_base <- sum(lapply(X = meat_pge, extract2, 2) %>% unlist()) +
    sum(lapply(X = meat_socal, extract2, 2) %>% unlist())
  # Load the X'X
  xx_pge <- lapply(X = pge_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/pgeFWL", ., ".rds") %>% readRDS()
  })
  xx_socal <- lapply(X = socal_done, FUN = function(x) {
    x %>% paste0(dir_fwl, "xx/socalFWL", ., ".rds") %>% readRDS()
  })
  # Calculate X'X
  xx_mrg <- sum(lapply(X = xx_pge, extract2, 1) %>% unlist()) +
    sum(lapply(X = xx_socal, extract2, 1) %>% unlist())
  xx_base <- sum(lapply(X = xx_pge, extract2, 2) %>% unlist()) +
    sum(lapply(X = xx_socal, extract2, 2) %>% unlist())
  # Variance and standard errors
  v_mrg <- meat_mrg / xx_mrg^2
  v_base <- meat_base / xx_base^2
  se_mrg <- sqrt(v_mrg)
  se_base <- sqrt(v_base)
